In [1]:
import os
import sys

# Make the local recipedb checkout importable. The location can be overridden
# with the RECIPEDB_PATH environment variable; when it is unset the original
# hard-coded path is used, so existing setups keep working.
sys.path.insert(0, os.environ.get("RECIPEDB_PATH", "/Users/artur/workspace/recipedb"))

In [2]:
from recipedb.db import get_db
import functools

In [3]:
# Database handle from recipedb.db.get_db(); the next cell queries its `allrecipes` collection.
db = get_db()

In [4]:
# Cursor over `allrecipes` documents whose "error" field is $eq None
# (presumably recipes that were scraped without an error — TODO confirm).
exs = db.allrecipes.find({"error":{"$eq":None}})

In [5]:
# Number of recipes to pull from the `exs` cursor for this analysis.
NUM_EXAMPLES = 30000
# Take at most NUM_EXAMPLES documents from the cursor. `range` comes first in
# `zip` so the cursor is never advanced past the last document that is kept;
# if the cursor runs dry early the list is simply shorter.
exs_recipes = [recipe for _idx, recipe in zip(range(NUM_EXAMPLES), exs)]

In [6]:
def write_list_file(file, lst):
    """Write the items of ``lst`` to ``file``, one item per line.

    Args:
        file: Path of the file to create or overwrite.
        lst: Iterable of items; each one is formatted with ``%s`` and
            followed by a newline.
    """
    with open(file, "w+") as handle:
        # Materialise the iterable before writing, as a one-shot generator
        # can only be consumed once.
        for item in list(lst):
            handle.write("%s\n" % item)

In [7]:
# Let's try simple counts first and then see whether TF-IDF improves on them; the first step is finding common ingredients
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [8]:
# Flatten every recipe's ingredient list into one long corpus of ingredients.
# A single flattening pass avoids the quadratic cost of repeatedly
# concatenating lists, and yields [] instead of raising TypeError when
# exs_recipes is empty.
# NOTE(review): assumes each recipe['data']['ingredients'] is a list of strings — confirm.
corpus = [
    ingredient
    for recipe in exs_recipes
    for ingredient in recipe['data']['ingredients']
]

In [114]:
# Both vectorizers are configured for n-grams of 1 to 5 tokens.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 5),
)
tfidf = TfidfVectorizer(
    ngram_range=(1, 5),
)

In [115]:
# Fit both vectorizers on the ingredient corpus; the fitted feature names are
# written to disk in the following cells.
vectorizer.fit(corpus)
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 5), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [116]:
# Dump the feature names (n-grams) of the count vectorizer, one per line.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() — confirm the installed version.
write_list_file('gram.txt', vectorizer.get_feature_names())

In [117]:
# Same dump for the TF-IDF vectorizer.
# NOTE(review): both vectorizers use n-grams of 1-5 tokens, so this list is
# presumably identical to gram.txt — verify before relying on it.
write_list_file("tfidf.txt", tfidf.get_feature_names())

In [118]:
def read_list_file(file):
    """Read ``file`` and return its lines with surrounding whitespace removed.

    Args:
        file: Path of a text file.

    Returns:
        list: One stripped string per line, in file order. Blank lines are
        kept as empty strings.
    """
    with open(file, "r") as handle:
        return [line.strip() for line in handle.readlines()]

In [119]:
# Reload the n-gram list that was written to gram.txt.
data = read_list_file('gram.txt')

In [120]:
# Run through the list and drop every entry that is contained in the entry
# immediately after it.
def remove_occurences(lst):
    """Return a copy of ``lst`` without entries contained in their successor.

    Only adjacent pairs are compared: an entry is dropped when it occurs
    (``in``) inside the entry that directly follows it in ``lst``. For
    example, "red", "red bell", "red bell pepper" collapses to
    "red bell pepper". Entries contained only in a non-adjacent entry are
    kept.

    Args:
        lst: Iterable of strings; it is copied and never modified.

    Returns:
        list: The surviving entries in their original order. The last entry
        is always kept because it has no successor.
    """
    items = list(lst)
    # Decide per position in a single pass. Removing by value (list.remove)
    # deletes the first equal entry, which may be an earlier one that had
    # already been kept, and costs a full scan for every removal.
    survivors = [
        current
        for current, following in zip(items, items[1:])
        if current not in following
    ]
    # The final entry has nothing after it to be compared with.
    survivors.extend(items[-1:])
    return survivors

In [121]:
# Collapse n-grams that are contained in the n-gram directly following them.
cleaned_data = remove_occurences(data)

In [122]:
# Ratio of surviving n-grams to NUM_EXAMPLES (the number of recipes requested).
len(cleaned_data)/NUM_EXAMPLES

2.6674

In [123]:
counts = {}

# Tally how often each whitespace-separated word occurs across the cleaned n-grams.
for string in cleaned_data:
    for part in string.split():
        counts[part] = counts.get(part, 0) + 1
    

In [124]:
# Display the per-word tallies.
counts

{'10': 395,
 'inch': 574,
 'flour': 203,
 'tortillas': 49,
 '12': 375,
 '25': 121,
 'ounce': 3285,
 'can': 1291,
 'beef': 462,
 'condensed': 157,
 '75': 87,
 'beefy': 5,
 'chicken': 234,
 'cream': 448,
 'fat': 255,
 'low': 111,
 'milk': 297,
 'new': 47,
 'tomato': 263,
 'water': 220,
 'cans': 645,
 'loaf': 101,
 'day': 29,
 'package': 1366,
 'frozen': 644,
 'prepared': 139,
 'almonds': 71,
 'apples': 147,
 'peeled': 995,
 'cored': 218,
 'and': 2906,
 'baking': 81,
 'potatoes': 421,
 'cut': 1120,
 'bananas': 45,
 'mashed': 117,
 'frankfurters': 23,
 'black': 278,
 'olives': 228,
 'peppercorns': 31,
 'blue': 33,
 'crab': 47,
 'claws': 4,
 'steamed': 23,
 'buttery': 28,
 'round': 153,
 'crackers': 117,
 'crushed': 408,
 'caramels': 16,
 'carrots': 165,
 'into': 1192,
 'sliced': 1466,
 'cherries': 115,
 'with': 547,
 'stems': 85,
 'cherry': 88,
 'tomatoes': 487,
 'halved': 277,
 'chocolate': 374,
 'covered': 40,
 'almond': 35,
 'buttercrunch': 9,
 'mint': 69,
 'candies': 32,
 'coarsely': 1

In [125]:
# Wrap the tallies in a pandas Series so they can be sorted and exported.
counts_series = pd.Series(counts)


In [126]:
# Export the 1000 highest word counts to counts.csv.
counts_series.sort_values(ascending=False).head(1000).to_csv('counts.csv')

In [127]:
# Words that are not ingredients, one per line (presumably hand-annotated —
# TODO confirm). Loaded through read_list_file so the file is opened
# read-only ("r") instead of "r+", which needlessly required write permission.
annotated_data = read_list_file('allrecipes_noningredient_words.txt')

In [129]:
def remove_common_words(data_series, stop_words=None):
    """Strip non-ingredient words and pure numbers from each string.

    Args:
        data_series: Iterable of whitespace-separated strings.
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``annotated_data`` list is used, as before.

    Returns:
        list: One string per input, in input order, rebuilt from the
        surviving words joined by single spaces. A string whose words are
        all removed becomes ``''``.
    """
    if stop_words is None:
        stop_words = annotated_data
    # Build the lookup once: set membership is constant-time, whereas the
    # list was scanned again for every word.
    blocked = set(stop_words)
    output = []
    for x in data_series:
        # Filtering keeps word order and drops every occurrence of a blocked
        # word or digit-only token.
        kept = [word for word in x.split() if not (word in blocked or word.isdigit())]
        output.append(' '.join(kept))
    return output

In [130]:
# Unique ingredient phrases left after stripping non-ingredient words and numbers.
# NOTE(review): a phrase made up only of removed words collapses to '' and stays in the set.
ingredients_set = set(remove_common_words(cleaned_data))

In [131]:
# Number of distinct ingredient phrases.
len(ingredients_set)

8527

In [132]:
# Persist the unique ingredient phrases, one per line, through the shared helper.
# NOTE(review): a later cell writes 'ingredients_set.txt' (plural) — confirm
# that two differently named files are intended.
write_list_file("ingredient_set.txt", ingredients_set)


In [133]:
# Ingredient phrases as a list so they can be sorted.
# (The duplicate-detection cell further down rebuilds this list itself.)
ingreds = list(ingredients_set)

In [134]:
# Sort the phrases in place.
ingreds.sort()

In [137]:
# Position-wise mismatch ratio modelled on the Hamming distance
# (https://en.wikipedia.org/wiki/Hamming_distance), made relative to the
# length of the shorter string.
def hamming_score(string1, string2):
    """Return the share of mismatching characters over the common length.

    The two strings are compared position by position up to the length of
    the shorter one; any extra characters of the longer string are ignored.
    A string that is a prefix of the other therefore scores 0.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        Mismatch count divided by the shorter length (0.0 = no mismatch,
        1.0 = every compared position differs). Returns 0 when either
        string is empty, since there is nothing to compare.
    """
    max_val = min(len(string1), len(string2))
    # Compare by value: `is 0` tests object identity, which only works by
    # accident of int caching and triggers a SyntaxWarning on Python 3.8+.
    if max_val == 0:
        return 0
    # zip stops at the shorter string, i.e. after max_val positions.
    dist_counter = sum(1 for left, right in zip(string1, string2) if left != right)
    return dist_counter / max_val

In [138]:
# Neighbouring phrases whose hamming_score is below this value count as duplicates.
THRESHOLD = 0.3
ingreds = sorted(ingredients_set)
duplicates = []
# Compare each phrase with its successor in sorted order.
for i, (first, second) in enumerate(zip(ingreds, ingreds[1:])):
    if hamming_score(first, second) >= THRESHOLD:
        continue
    print('matched')
    print(first)
    print(second)
    duplicates.append(first)  # the earlier phrase of the pair is the one flagged

matched

24x8x1 untreated cedar plank
matched
acini di
acini di pepe
matched
acini di pepe
acini di pepe pasta
matched
active
active yeast
matched
adobo sauce
adobo sauce from canned chipotle
matched
adobo sauce from canned chipotle
adobo sauce taste
matched
adobo seasoning
adobo seasoning taste
matched
albacore tuna
albacore tuna broken
matched
albacore tuna steaks
albacore tuna water
matched
ale
ale soda
matched
alfredo
alfredo pasta
matched
alfredo pasta
alfredo pasta sauce
matched
all purpose apples
all purpose apples cored
matched
all purpose baking mix
all purpose flour
matched
all purpose flour
all purpose flour for
matched
all purpose flour for
all purpose flour for coating
matched
all purpose flour for coating
all purpose flour for dredging
matched
all purpose flour for dredging
all purpose flour for dusting
matched
all purpose flour for dusting
all purpose flour for rolling
matched
all purpose flour for rolling
all purpose flour sifted
matched
allspice
allspice berries
matche

matched
chipotle peppers adobo
chipotle peppers adobo sauce
matched
chipotle peppers adobo sauce
chipotle peppers taste
matched
chipped beef
chips
matched
chips
chips chunks
matched
chive onion cream
chive onion cream cheese
matched
chive onion cream cheese
chives
matched
chocolate
chocolate almond liqueur
matched
chocolate almond liqueur
chocolate cake
matched
chocolate cake
chocolate cake crumbs
matched
chocolate cake crumbs
chocolate cake layers cooled
matched
chocolate cake layers cooled
chocolate cake mix
matched
chocolate candy bar
chocolate candy bar broken
matched
chocolate candy bar broken
chocolate candy kisses
matched
chocolate cereal puffs
chocolate chip
matched
chocolate chip
chocolate chip cookies
matched
chocolate chip ice cream
chocolate chips
matched
chocolate chips
chocolate chips melted
matched
chocolate cookie crumbs
chocolate covered
matched
chocolate covered
chocolate covered almond buttercrunch
matched
chocolate covered almond buttercrunch
chocolate covered almon

matched
dill pickles
dill pickles patted
matched
dill pickles patted
dill pickles wedges
matched
dill weed
dill weed taste
matched
dinner
dinner roll
matched
dinner roll
dinner roll dough
matched
dinner roll dough
dinner rolls
matched
dinner rolls
dinner rolls split
matched
dish pie crust
dish pie crusts
matched
dog links
dogs
matched
dogs
dogs beef pork frankfurters
matched
dough
dough baked
matched
down backbone
down backbone heavy
matched
dr pepper
dr pepper soft
matched
dr pepper soft
dr pepper soft drink
matched
dressed
dressing
matched
dressing
dressing ie fat free miracle
matched
dressing miracle whip
dressing mix
matched
dressing mix
dressing more taste
matched
drink mix
drink mix crystal
matched
drink mix powder
drink mix tang
matched
duck
duck blood
matched
dungeness crab
dungeness crabmeat
matched
each
each down
matched
each down
each down backbone
matched
each neck
each neck giblets
matched
each neck giblets
each neck giblets discarded
matched
eagle brand
eagle brand sweete

ice cream topping
matched
ice cubes
ice cubes an ice
matched
iceberg
iceberg lettuce
matched
iceberg lettuce
iceberg lettuce bite
matched
iceberg lettuce bite
iceberg lettuce julienned
matched
idaho
idaho dice
matched
if drying toasting
if drying toasting bread
matched
imitation crab legs
imitation crab meat flaked
matched
imitation crab meat flaked
imitation crabmeat
matched
imitation crabmeat
imitation crabmeat flaked
matched
included
including some leaves
matched
individually wrapped
individually wrapped caramels
matched
individually wrapped caramels
individually wrapped caramels unwrapped
matched
instant banana pudding
instant banana pudding mix
matched
instant butterscotch pudding
instant butterscotch pudding mix
matched
instant cheddar cheese sauce
instant cheddar cheese sauce mix
matched
instant chocolate drink mix
instant chocolate pudding
matched
instant chocolate pudding
instant chocolate pudding mix
matched
instant cocoa
instant coconut cream
matched
instant coconut cream
in

mushroom cap
matched
mushroom cap
mushroom caps
matched
mushroom gravy
mushroom gravy mix
matched
mushroom soup
mushroom soup mix
matched
mushroom soy sauce
mushroom stems
matched
mushroom stems
mushrooms
matched
mushrooms
mushrooms cleaned
matched
mushrooms stemmed
mushrooms stems
matched
mussels
mussels cleaned
matched
mussels cleaned
mussels cleaned debearded
matched
mussels scrubbed
mussels scrubbed debearded
matched
mustard
mustard mayonnaise
matched
mustard mayonnaise
mustard mayonnaise blend
matched
mustard sauce
mustard seed
matched
mustard seed
mustard seeds
matched
nacho cheese dip
nacho cheese soup
matched
nacho flavor
nacho flavor tortilla
matched
nacho flavor tortilla
nacho flavor tortilla chips
matched
navel oranges
navy
matched
navy
navy beans
matched
navy beans
navy beans juice
matched
neck giblets
neck giblets discarded
matched
neck giblets liver from
neck giblets reserved
matched
nectarine
nectarines
matched
nestle
nestle carnation
matched
nestle carnation
nestle carn

ready use strawberry
ready use strawberry glaze
matched
recipe pastry for
recipe pastry for double
matched
recipe pastry for double
recipe pastry for double crust
matched
recipe pastry for double crust
recipe pastry for single
matched
red
red anjou pears
matched
red apple
red apple cored
matched
red beans
red beans liquid
matched
red beans liquid
red beets
matched
red beets
red bell
matched
red bell
red bell pepper
matched
red bell pepper
red bell pepper cored
matched
red bell pepper cored
red bell pepper julienned
matched
red bell pepper julienned
red bell peppers
matched
red bell peppers
red bell peppers julienned
matched
red bell peppers julienned
red bell peppers roasted
matched
red chile pepper
red chile peppers
matched
red chile peppers
red chile powder
matched
red chile powder
red chile sauce
matched
red chili pepper
red chili peppers
matched
red chili peppers
red chili peppers pounded
matched
red currant jelly
red currants
matched
red decorator
red decorator sugar
matched
red d

stilton cheese crumbed
stilton cheese crumbled
matched
stir fry mix
stir fry vegetables
matched
stock
stock pan drippings
matched
stout
stout beer
matched
strained chicken baby
strained chicken baby food
matched
strawberries
strawberries chunks
matched
strawberries chunks
strawberries cleaned hulled
matched
strawberries hulled
strawberries leaves
matched
strawberries leaves
strawberries mashed
matched
strawberries mashed
strawberries partially
matched
strawberries partially
strawberries syrup
matched
strawberries syrup
strawberry
matched
strawberry
strawberry cheesecake yogurt
matched
strawberry cheesecake yogurt
strawberry cream
matched
strawberry cream
strawberry cream cheese
matched
strawberry daiquiri
strawberry daiquiri mixer
matched
strawberry danish dessert
strawberry danish dessert mix
matched
strawberry danish dessert mix
strawberry danish dessert mix junket
matched
strawberry ice cream
strawberry ice cream softened
matched
strawberry ice cream softened
strawberry jam
matched


yellow apple
yellow apples cored
matched
yellow bell pepper
yellow bell pepper julienned
matched
yellow bell pepper julienned
yellow bell peppers
matched
yellow cake
yellow cake mix
matched
yellow chili pepper peruvian
yellow corn
matched
yellow corn
yellow cornmeal
matched
yellow mustard
yellow mustard taste
matched
yellow onion
yellow onion wedges
matched
yellow onion wedges
yellow onions
matched
yellow onions
yellow onions wedges
matched
yellow squash
yellow squash julienned
matched
yellow wax
yellow wax beans
matched
yogurt
yogurt covered raisins
matched
yolk beaten water
yolks
matched
yolks
yolks beaten
matched
zest
zest for garnish
matched
zested
zested juiced
matched
zesty italian
zesty italian dressing
matched
zesty italian dressing
zesty italian salad
matched
zesty italian salad
zesty italian salad dressing
matched
ziti
ziti pasta
matched
zucchini
zucchini chunks
matched
zucchini rounds
zucchini then
matched
zucchini yellow
zucchini yellow summer
matched
zucchini yellow summer

In [139]:
# Drop the phrases flagged as duplicates and sort what is left.
# A set lookup replaces one list.remove() call per duplicate, each of which
# scanned the whole list. Flagged phrases that are not in ingredients_set are
# now ignored instead of raising ValueError.
# Note: despite its name, cleaned_ingredients_set is a sorted list.
flagged = set(duplicates)
cleaned_ingredients_set = sorted(
    phrase for phrase in ingredients_set if phrase not in flagged
)


In [140]:
# Save the cleaned, sorted ingredient phrases, one per line.
# NOTE(review): an earlier cell writes "ingredient_set.txt" (singular) —
# confirm that two differently named files are intended.
write_list_file('ingredients_set.txt', cleaned_ingredients_set)

In [141]:
def term_counts(ingredients_set):
    """Count how often each word occurs across a collection of phrases.

    Args:
        ingredients_set: Iterable of whitespace-separated strings.

    Returns:
        dict: Maps each word to its total number of occurrences; a word
        repeated inside one phrase is counted every time. Keys appear in
        order of first occurrence.
    """
    tally = {}
    words = (word for phrase in ingredients_set for word in phrase.split())
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return tally

In [142]:
# Recount words over the de-duplicated phrases and export the 1000 highest counts.
# NOTE(review): this rebinds `counts` / `counts_series` and overwrites the
# counts.csv written by the earlier export cell.
counts = term_counts(cleaned_ingredients_set)
counts_series = pd.Series(counts)
counts_series.sort_values(ascending=False).head(1000).to_csv('counts.csv')