In [1]:
import sys
# Put the local recipedb checkout at the front of the import path so that
# `recipedb` can be imported below.
# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook has to edit it.
sys.path.insert(0, "/Users/artur/workspace/recipedb")

In [2]:
from recipedb.db import get_db
import functools

In [3]:
# Obtain the database handle through the project's `recipedb.db.get_db` helper.
db = get_db()

In [4]:
# Query the `allrecipes` collection for documents whose "error" field equals None.
# NOTE(review): an `$eq: None` filter presumably also matches documents that have no
# "error" field at all -- confirm against the database driver's documentation.
exs = db.allrecipes.find({"error":{"$eq":None}})

In [5]:
# Upper bound on how many recipes are pulled from the cursor created above.
NUM_EXAMPLES = 10000
# Pair a bounded range with the cursor so at most NUM_EXAMPLES documents are read.
# The range comes first so that zip stops without consuming an extra document, and
# a cursor holding fewer documents simply yields a shorter list.
exs_recipes = [recipe for _, recipe in zip(range(NUM_EXAMPLES), exs)]

In [33]:
# Let's try simple counts first, then see whether TF-IDF improves on them; the first step is finding common ingredients.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [7]:
# Flatten every recipe's ingredient list into one long corpus of ingredient entries.
# A single comprehension is linear in the total number of ingredients (repeated
# `a + b` list concatenation copies the growing list on every step) and produces an
# empty list instead of raising TypeError when no recipes were loaded.
corpus = [
    ingredient
    for recipe in exs_recipes
    for ingredient in recipe['data']['ingredients']
]

In [8]:
# Raw-count model over word n-grams of length 1 through 5.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 5),
)
# TF-IDF model over the same n-gram range; its analyzer is left at the default.
tfidf = TfidfVectorizer(
    ngram_range=(1, 5),
)

In [9]:
# Learn the n-gram vocabulary of both models from the ingredient corpus.
vectorizer.fit(corpus)
# Last expression of the cell: the value returned by fit() is what is shown below.
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 5), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [10]:
# Dump the count vectorizer's vocabulary to gram.txt, one n-gram per line.
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in favour of
# get_feature_names_out(); prefer the new accessor and fall back on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    gram_names = vectorizer.get_feature_names_out()
else:
    gram_names = vectorizer.get_feature_names()

# Plain "w" is enough: the file is only written here, never read back through `f`.
with open("gram.txt", "w") as f:
    f.writelines("%s\n" % name for name in gram_names)

In [11]:
# Dump the TF-IDF vectorizer's vocabulary to tfidf.txt, one n-gram per line.
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in favour of
# get_feature_names_out(); prefer the new accessor and fall back on old releases.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_names = tfidf.get_feature_names_out()
else:
    tfidf_names = tfidf.get_feature_names()

# Plain "w" is enough: the file is only written here, never read back through `f`.
with open("tfidf.txt", "w") as f:
    f.writelines("%s\n" % name for name in tfidf_names)

In [12]:
# Reload the saved n-gram vocabulary: one list entry per line, newline included.
with open("gram.txt", "r") as f:
    data = list(f)

In [13]:
# Strip the trailing newline (and any surrounding whitespace) from every entry.
data = [line.strip() for line in data]

In [14]:
# Run through the list and remove every entry that is contained in the entry directly after it.
def remove_occurences(lst):
    """Drop every entry that is contained in the entry immediately after it.

    Each entry is compared only with its direct successor in the input order,
    using the ``in`` operator. For strings this is a substring test, so "can"
    is dropped when it is followed by "candies" as well as by "can tomatoes".
    The last entry has no successor and is always kept.

    NOTE(review): this is presumably meant to run on a sorted vocabulary, where
    an n-gram is directly followed by its longer extensions -- confirm the
    input ordering at the call site.

    Args:
        lst: Iterable of entries (strings in this notebook). It is copied and
            never modified.

    Returns:
        A new list with the surviving entries in their original order.
    """
    items = list(lst)
    # One linear pass over adjacent pairs. The earlier `lst.remove(current)`
    # rescanned the list on every hit (quadratic overall) and deleted the
    # *first* equal element, which is the wrong one when duplicates exist.
    kept = [current for current, nxt in zip(items, items[1:]) if current not in nxt]
    # The slice keeps this safe for an empty input.
    kept.extend(items[-1:])
    return kept

In [15]:
# Collapse the vocabulary: drop each n-gram that is contained in the entry after it.
cleaned_data = remove_occurences(data)

In [16]:
# Number of n-grams that survived the clean-up.
len(cleaned_data)

26674

In [17]:
# Surviving n-grams per loaded recipe.
len(cleaned_data)/len(exs_recipes)

2.6674

In [20]:
# Persist the cleaned n-grams to cleaned.txt, one entry per line.
with open("cleaned.txt", "w+") as f:
    f.writelines("%s\n" % entry for entry in cleaned_data)


In [30]:
# Tally how many cleaned n-grams each whitespace-separated token appears in
# (a token repeated inside one n-gram is counted once per repetition).
counts = {}

for string in cleaned_data:
    for part in string.split():
        # Unseen tokens start from zero.
        counts[part] = counts.get(part, 0) + 1
    

In [31]:
# Inspect the token -> count mapping built above.
counts

{'10': 395,
 'inch': 574,
 'flour': 203,
 'tortillas': 49,
 '12': 375,
 '25': 121,
 'ounce': 3285,
 'can': 1291,
 'beef': 462,
 'condensed': 157,
 '75': 87,
 'beefy': 5,
 'chicken': 234,
 'cream': 448,
 'fat': 255,
 'low': 111,
 'milk': 297,
 'new': 47,
 'tomato': 263,
 'water': 220,
 'cans': 645,
 'loaf': 101,
 'day': 29,
 'package': 1366,
 'frozen': 644,
 'prepared': 139,
 'almonds': 71,
 'apples': 147,
 'peeled': 995,
 'cored': 218,
 'and': 2906,
 'baking': 81,
 'potatoes': 421,
 'cut': 1120,
 'bananas': 45,
 'mashed': 117,
 'frankfurters': 23,
 'black': 278,
 'olives': 228,
 'peppercorns': 31,
 'blue': 33,
 'crab': 47,
 'claws': 4,
 'steamed': 23,
 'buttery': 28,
 'round': 153,
 'crackers': 117,
 'crushed': 408,
 'caramels': 16,
 'carrots': 165,
 'into': 1192,
 'sliced': 1466,
 'cherries': 115,
 'with': 547,
 'stems': 85,
 'cherry': 88,
 'tomatoes': 487,
 'halved': 277,
 'chocolate': 374,
 'covered': 40,
 'almond': 35,
 'buttercrunch': 9,
 'mint': 69,
 'candies': 32,
 'coarsely': 1

In [45]:
# Wrap the token counts in a pandas Series (index: token, value: count).
counts_series = pd.Series(counts)


In [51]:
# Export the 1000 highest-count tokens to counts.csv.
(
    counts_series
    .sort_values(ascending=False)
    .head(1000)
    .to_csv('counts.csv')
)

In [61]:
# Load the list of non-ingredient words, one word per line, stripped of whitespace.
# NOTE(review): the file is presumably hand-annotated from counts.csv -- confirm.
# Opened read-only ("r"): the previous "r+" requested write access that is never
# used and fails needlessly when the word list is not writable.
with open('allrecipes_noningredient_words.txt', "r") as f:
    annotated_data = [line.strip() for line in f]

In [62]:
# Show the loaded non-ingredient words.
annotated_data

['ounce',
 'and',
 'chopped',
 'cup',
 'sliced',
 'package',
 'can',
 'cups',
 'into',
 'cut',
 'fresh',
 'ounces',
 'drained',
 'peeled',
 'diced',
 'or',
 'pound',
 'tablespoons',
 'optional',
 'cans',
 'frozen',
 'inch',
 'with',
 'dry',
 'tablespoon',
 'shredded',
 'flavored',
 'cubed',
 'dried',
 'pounds',
 'large',
 'fluid',
 'teaspoon',
 'cooked',
 'crushed',
 'finely',
 'to',
 'ground',
 'rinsed',
 'teaspoons',
 'in',
 'of',
 'as']

In [69]:
def remove_common_words(data_series, common_words=None):
    """Strip non-ingredient words and pure-digit tokens from every entry.

    Each entry is split on whitespace; a token is dropped when it is one of
    ``common_words`` or consists only of digits (``str.isdigit``), so a
    fraction such as "1/2" is kept. The remaining tokens are re-joined with
    single spaces.

    Args:
        data_series: Iterable of strings to clean.
        common_words: Iterable of words to remove. Defaults to the notebook-level
            ``annotated_data`` list, looked up at call time, which keeps the
            original one-argument call working.

    Returns:
        A list with one cleaned string per input entry, in input order. An
        entry whose tokens are all removed becomes the empty string.
    """
    if common_words is None:
        common_words = annotated_data
    # Set membership avoids scanning the whole word list for every token.
    excluded = frozenset(common_words)
    return [
        ' '.join(
            word
            for word in entry.split()
            if word not in excluded and not word.isdigit()
        )
        for entry in data_series
    ]

In [71]:
# Strip the non-ingredient words from every cleaned n-gram and de-duplicate the
# results; entries that end up identical collapse into one.
ingredients_set = set(remove_common_words(cleaned_data))

In [73]:
# Number of distinct candidate ingredient strings.
len(ingredients_set)

11934