In [1]:
import numpy as np
import pandas as pd

import pickle
import sys
import string
import unidecode
import scipy.sparse
from scipy.sparse.csr import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils


Using TensorFlow backend.


In [2]:
#!aws s3 cp s3://RecipeVectors/sparse_recipe_ingredient_matrix.npz .

In [4]:
# https://stackoverflow.com/a/8980156/2491761
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

recipe_ingredient_matrix = load_sparse_csr("sparse_recipe_ingredient_matrix.npz")

In [5]:
print type(recipe_ingredient_matrix)
print recipe_ingredient_matrix.size
print recipe_ingredient_matrix.nnz
print recipe_ingredient_matrix.shape

<class 'scipy.sparse.csr.csr_matrix'>
959104
959104
(89061, 19013)


In [6]:
svd = TruncatedSVD(n_components=50)
reduced_recipe_ingredient_matrix = svd.fit_transform(recipe_ingredient_matrix)

print type(reduced_recipe_ingredient_matrix)
print reduced_recipe_ingredient_matrix.size

#print recipe_ingredient_matrix.toarray().size
#print reduced_recipe_ingredient_matrix.toarray().size

<type 'numpy.ndarray'>
4453050


In [10]:
#dense_term_matrix = recipe_ingredient_matrix.toarray()
#KxMxN = np.stack(recipe_ingredient_matrix.toarray(), reduced_recipe_ingredient_matrix)
#print KxMxN.size

In [None]:
#!aws s3 cp s3://RecipeVectors/unique_ingredients.pkl .

In [11]:
with open('unique_ingredients.pkl', 'rb') as f:
    unique_ingredients = pickle.load(f)

In [None]:
#!aws s3 cp s3://RecipeVectors/CleanedIngredients.pkl .

In [6]:
df = pd.read_pickle('CleanedIngredients.pkl')

In [7]:
def translate_non_alphanumerics(to_translate, translate_to=u''):
    # https://stackoverflow.com/a/1324274/2491761
    not_letters = u'!"#%\'()&*+,-./:;<=>?@[\]^_`{|}~0123456789'
    if isinstance(to_translate, unicode):
        #print "unicode instance"
        translate_table = dict((ord(char), unicode(translate_to)) for char in not_letters)
    else:
        #print "not unicode instance"
        assert isinstance(to_translate, str)
        #translate_table = string.maketrans(not_letters, translate_to *len(not_letters))
        return to_translate.translate(None, string.punctuation + string.digits)
    return to_translate.translate(translate_table)

def extract_ingredients(full_ingred_list, strip_punct=True):
    # From: https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement
    meas_units = [
        'teaspoon', 'teaspoons',
        't',
        'tsp', 'tsps',
        'tablespoon','tablespoons',
        'tbl',
        'tbs',
        'tbsp', 'tbsps',
        'fl', 'fluid',
        'oz', 'ozs',
        'ounce', 'ounces',
        'cup', 'cups',
        'c',
        'pint', 'pints',
        'p',
        'pt',
        'quart', 'quarts',
        'qt', 'qts',
        'q', 'qs',
        'gallon', 'gallons',
        'gal', 'gals',
        'ml',
        'milliliter', 'milliliters',
        'millilitre', 'millilitres',
        'cc',
        'l',
        'liter', 'liters',
        'litre', 'litres',
        'pinch', 'pinches',        
        'pound', 'pounds',
        'lb', 'lbs',
        'mg', 'mgs',
        'milligram', 'milligrams',
        'milligramme', 'milligrammes',
        'g', 'gs',
        'gram', 'grams',
        'gramme', 'grammes',
        'kg', 'kgs',
        'kilogram', 'kilograms',
        'kilogramme', 'kilogrammes',
        'mm', 'mms',
        'millimeter', 'millimeters',
        'millimetre', 'millimetres',
        'cm', 'cms',
        'centimeter', 'centimeters',
        'centimetre', 'centimetres',
        'm', 'ms',
        'meter', 'meters',
        'metre', 'metres',
        'inch', 'inches',
        'in', 'ins',
        ###
        'loaf', 'loaves',
        'pouch', 'pouches',
        'wedge', 'wedges',
        'drop', 'drops',
        'amount', 'amounts',
        'bulk', 'bulks',
        'coating', 'coatings',
        'carton', 'cartons',
        'count',
        'dusting', 'dustings',
        'roll', 'rolls',
        'hint', 'hints',
        'round', 'rounds',
        'cube', 'cubes',
        'husk', 'husks',
        'envelope', 'envelopes',
        'container', 'containers',
        'dash', 'dashes',
        'bitesize', 'bitesized',
        'bite', 'sized',
        'size',
        'each',
        'taste', 'desired',
        'can', 'cans',
        'unit', 'units',
        'box', 'boxes',
        'tub', 'tubs',
        'slab', 'slabs',
        'sprig', 'sprigs',
        'stalk', 'stalks',
        'matchstick', 'matchsticks',
        'balls',
        'clove', 'cloves',
        'slice', 'slices',
        'head', 'heads',
        'spear', 'spears',
        'chunk', 'chunks',
        'piece', 'pieces',
        'jar', 'jars',
        'package', 'packages',
        'pack', 'packs',
        'packet', 'packets',
        'bunch', 'bunches',
        'tube', 'tubes',
        'jug', 'jugs',
        'bottle', 'bottles',
        'stick', 'sticks',
        'strip', 'strips',
        'bag', 'bags',
        'dash', 'dashes',
        'container', 'containers',
        'envelope', 'envelopes',
        'rounds',
        'sheet', 'sheets',
        'squares',
        'whole',
        'extra', 'extras',
        'dozen', 'dozens',
        'half', 'halves',
        'third', 'thirds',
        'quarter', 'quarters',
        'fifth', 'fifths',
        'eighth', 'eighths',
    ]
    
    preparatory_descriptions = [
        'baked',
        'beat', 'beaten', 'wellbeaten',
        'blanched',
        'blended',
        'boiled',
        'bottled',
        'broiled',
        'broken',
        'browned',
        'canned',
        'chilled',
        'chipped',
        'chopped',
        'cleaned',
        'converted',
        'cooked',
        'cooled',
        'cored',
        'crumbled',
        'crushed',
        'cubed',
        'cut',
        'defrosted',
        'deshelled',
        'desilked',
        'deveined',
        'diced', 'dice',
        'dissolved',
        'divided',
        'drained',
        'dried',
        'filleted',
        'filtered',
        'flaked',
        'fresh',
        'fried',
        'frozen',
        'grated',
        'grilled',
        'ground',
        'halved',
        'hardened',
        'heated',
        'hulled',
        'husked',
        'jarred',
        'juiced',
        'julienned', 'julienne', 'juliennecut', 'juliennesliced',
        'kneaded', 'kneading',
        'marinated',
        'mashed',
        'matchstickcut',
        'melted',
        'microwaved',
        'minced',
        'mixed',
        'opened',
        'packed',
        'peeled',
        'pitted',
        'prepared',
        'pressed',
        'prewashed',
        'processed',
        'quartered',
        'raw',
        'refrigerated',
        'removed',
        'rinsed',
        'ripe',
        'ripened',
        'ripped',
        'roasted',
        'rolled', 'rolling',
        'salted',
        'saved',
        'scored',
        'scrubbed',
        'seasoned',
        'sectioned',
        'seeded',
        'separated',
        'shaved',
        'shelled',
        'shredded',
        'shucked',
        'sifted',
        'smashed',
        'smoked',
        'snipped',
        'sliced',
        'slivered',
        'softened',
        'split',
        'squeezed',
        'steamed',
        'stemmed',
        'soaked',
        'sweetened',
        'thawed',
        'toasted',
        'torn',
        'trimmed',
        'uncored',
        'uncooked',
        'undrained',
        'unhusked',
        'unopened',
        'unpeeled',
        'unsalted',
        'unsweetened',
        'unwashed',
        'use', 'used',
        'warmed',
        'washed',
        'zested',
    ]
    
    other_words_to_filter = [
        # articles:
        'a', 'an', 'the',
        # adverbs:
        'all',
        'approximately',
        'coarsely',
        'crosswise',
        'diagonally', 
        'finely',
        'freshly',
        'horizontally',
        'left',
        'lengthwise', 
        'lightly',
        'like',
        'overnight',
        'piecewise',
        'plus',
        'right',
        'roughly',
        'sideways',
        'slightly',
        'stiffly',
        'then',
        'thickly',
        'thinly',
        'vertically',
        'very',
        'widthwise', 
        # adjectives:
        'according',
        'allpurpose',
        'another',
        'any',
        'certified',
        'classic',
        'coarse', 
        'cold',
        'discarded',
        'dry',
        'extralarge',
        'extrasmall', 
        'fine', 
        'firm',
        'fresh',
        'hard',
        'hot',
        'jumbo',
        'large', 
        'lean',
        'leftover',
        'less',
        'long',
        'medium',
        'more',
        'multicolored',
        'natural',
        'needed',
        'only',
        'optional',
        'organic',
        'original',
        'other',
        'pure',
        'real',
        'rough',
        'short',
        'small',
        'stale',
        'soft',
        'such',
        'thick',
        'thickcut',
        'thicklycut',
        'thicklysliced',
        'thicksliced',
        'thin',
        'thincut',
        'thinlycut',
        'thinlysliced',
        'thinsliced',
        # prepositions:
        'about',
        'across',
        'against',
        'apart',
        'as',
        'at',
        'by',
        'for',
        'from',
        'in',
        'into',
        'of',
        'on',
        'to',
        'with',
        'without',
        # conjunctions:
        'and',
        'if',
        'or',
        # abbreviations:
        'eg', 'ie',
        # nouns:
        'directions',
        'hand',
        'ingredients',
        'list',
        'purpose',
        'room',
        'temperature',
        # verbs:
        'add',
        'including',
    ]
    
    #print "Before..."
    #print full_ingred_list
    
    # strip punctuation chars
    # all lowercase
    if strip_punct:
        cleaned_ingred_list = [translate_non_alphanumerics(s).lower() for s in full_ingred_list]
    else:
        cleaned_ingred_list = full_ingred_list
    #print cleaned_ingred_list
    
    # split on whitespace
    # throw out meas units
    #print
    #print "After..."
    if isinstance(cleaned_ingred_list[0], unicode):
        ingreds_only = [' '.join(w for w in unidecode.unidecode(ingred_item).split() if w not in meas_units and w not in preparatory_descriptions and w not in other_words_to_filter)
             for ingred_item in cleaned_ingred_list
             ]
    else:
        assert isinstance(cleaned_ingred_list[0], str)
        ingreds_only = [' '.join(w for w in ingred_item.split() if w not in meas_units and w not in preparatory_descriptions and w not in other_words_to_filter)
             for ingred_item in cleaned_ingred_list
             ]
    ingreds_only = filter(None, ingreds_only) # remove empty strings at end
    #print ingreds_only
    
    brand_names = [
        'a(r)',
        'accent(r)',
        'adams(r)',
        'al fresco(r)',
        'accent(r)',
        'annies lemon chive dressing(r)',
        'archer farms(tm)',
        'backdraft fire sauce(r)',
        'barilla(r)',
        'beau monde (tm)',
        'beechers(r) flagship', 'beechers(r)',
        'beer shiner(r)',
        'best foods(r)',
        'better than bouillon(r)',
        'birds eye(r)',
        'bisquick(r)',
        'bisquick (r)',
        'borden(r)',
        'bragg(r)',
        'briannas home style(r)', 'briannas(r)',
        'brooks(r)',
        'bruces(r)',
        'budweiser(r)',
        'buitoni(r)',
        'bumble bee(r)',
        'bushs best(r)', 'bushs homestyle(r)', 'bushs chili magic(r)', 'bushs(r)',
        'butterball(r)',
        'butter buds(r)',
        'campari(r)',
        'campbells(r) healthy request', 'campbells(r)',
        'carolina pride(r)',
        'carroll shelbys(r)',
        'catelli bistro(r)',
        'cavenders(r)',
        'chambord(r)',
        'chef paul prudhommes salmon magic(r)',
        'cheese whiz(r)', 'cheez whiz(r)',
        'cholula(r)',
        'clamato(r)',
        'classicmac(tm)',
        'classico (tm)',
        'classico (r)',
        'classico(r)',
        'claussen(r)',
        'coke(r)',
        'cointreau(r)',
        'college inn(r)',
        'contadina(r)',
        'cool whip(r)',
        'coors(r)',
        'cornnuts (r)',
        'corona(r)',
        'craisins(r)',
        'crazy steves(tm) cajun cukes', 'crazy steves(r)',
        'crisco(r)',
        'daisy(r)',
        'dei fratelli(r)',
        'del monte(r)',
        'diamond crystal(r)',
        'dickinsons(r)',
        'dole(r)',
        'dole asian island crunch(tm)',
        'dole(r) veggie(tm)',
        'doritos(r)',
        'dubliner(r)',
        'dukes(r)',
        'durkee(r)', 'famous sauce(r)',
        'eagle brand(r)',
        'earth balance(r)',
        'eckrich(r)',
        'edward sons(r)',
        'egg beaters(r)',
        'el pato(r)',
        'embasa(r)',
        'emerald(r)',
        'equal(r)',
        'farmland(r)',
        'fat tire(r)',
        'fiesta blend(r)',
        'follow your heart(r) veganaise(r)', 'follow your heart(r)',
        'foster farms(r)',
        'franks(r) redhot', 'franks red hot (r)', 'franks redhot (r)', 'franks redhot(r)', 'redhot(r)', 'franks(r)',
        'frenchs(r)',
        'frichik(r)',
        'fritos(r)',
        'frontera(r)',
        'goya(r)',
        'gebhardt(r)',
        'girards(r) olde venice',
        'good seasons(r)',
        'gourmet garden(tm)',
        'grapenuts(tm)',
        'great american spice co(tm)',
        'green giant(r)',
        'guinness(r)',
        'harp(r)',
        'hatch(r)',
        'healthy choice(r)',
        'heineken(r)', 
        'heinz (r)', 'heinz(r)',
        "hellmann's(r) best foods(r)", 'hellmanns(r) best foods(r)', 'hellmanns(r)best foods(r)',
        'hellmanns light(r)', 'hellmans(r) light', 'hellmanns(r) low fat',
        'hellmans(r)', "hellmann's(r)",
        'hidden valley ranch(r)', 'hidden valley(r) original ranch(r)',
        'hillshire farm(r)',
        'holland house(r)',
        'horizon(r)',
        'hormel(r)', 'hormel(tm)',
        'house tsang(r)',
        'hunts(r)',
        'idahoan(r)','baby reds(r)', 'buttery homestyle(r)',
        'imagine(r)', 'imagine(tm)',
        'jack daniels(r)',
        'jello(r)',
        'jennieo(r)',
        'jiffy(r)',
        'jim beam(r)',
        'jimmy dean(r)',
        'johnsonville(r)',
        'johnnys seasoning salt(r)',
        'kame(r)',
        'karys(r)',
        'kc masterpiece(r)',
        'keebler club(r)', 'keebler(r)',
        'keens(r)',
        'kens steak house(r)',
        'kens steak house lite northern italian dressing(r)',
        'kerrygold(r)',
        'kewpie(r)',
        'king arthur(r)',
        'king oscar(r)',
        'kirkland(r)',
        'kitchen basics(r)',
        'kitchen bouquet(r)',
        'kitchen secrets(r)',
        'klondike gourmet(r)', 'klondike goldust(r)', 'klondike(r)',
        'knorr(r) rice sides(tm)', 'knorr(r) fiesta sides(tm)', 'knorr(r)', 'knorrs(r)',
        'knox (r)',
        'kraft(r)',
        'laphroiag(r)',
        'laughing cow(r)',
        'lawrys(r)',
        'libbys(r)',
        'lightlife(r) organic smoky tempeh strips(r)', 'lightlife(r)',
        'lipton(r) recipe secrets(r)', 'lipton(r)',
        'locatelli(r)',
        'loma linda(r)',
        'lotus(r)',
        'louisiana(r)',
        'maggi(r)',
        'mahatma(r)',
        'maifun(r)',
        'maille(r)',
        'malibu(r)',
        'marie callenders(r)',
        'maries(r)',
        'market pantry(tm)',
        'marzettis(r)',
        'maui(r)',
        'mazola(r)',
        'mccormick chili seasoning mix sodium(r)', 'mccormick(r)',
        'mexene(r)',
        'mexenes(r)',
        'mexicorn(r)',
        'miller lite(r)',
        'mimiccreme(r)',
        'minute(r)',
        'miracle whip(r)',
        'miracle whip(tm)',
        'miracle whip (tm)',
        'miracle whip free(r)',
        'miracle whip light(r)', 'light miracle whip(r)',
        'montreal steak seasoning(r)',
        'morningstar farms(r) grillers recipe crumbles(r)', 'morningstar farms(r) recipe crumbles(r)', 'morningstar farms(r)',
        'morton(r) natures seasons(r)', 'morton(r)',
        'motts(r)',
        'mountain dew(r)',
        'mrs butterworths(r)',
        'mrs dash(r)',
        'muir glen(r)',
        'nabisco(r)',
        'natural goodness(tm)',
        'negra modelo(r)',
        'newcastle(r)',
        'newmans own(r)',
        'newmans own(r) lighten up(r)', 'lighten up(r)',
        'niblets(r)',
        'nissin(r) top ramen',
        'no yolks(r)',
        'ocean spray(r)',
        'old bay(r)', 'old bay (tm)', 'old bay(tm)',
        'old el paso(r)',
        'olive garden(r)',
        'oreida(r) steam n mash(r)', 'oreida(r)', 'ore ida(r)',
        'oreo(r)',
        'ortega(r)',
        'ottogi ramyonsari(r)', 'ottogi(r)',
        'quorn(tm)',
        'pace(r)',
        'pam(r)',
        'pataks(r)',
        'penzeys(r)', 'penzys(r)',
        'pepperidge farm(r)',
        'perdue(r)',
        'pernod(r)',
        'perrins(r)',
        'pillsbury grands(r)',
        'planters(r)',
        'plochmans(r)',
        'progresso(r)',
        'ragu(r) old world style(r)', 'ragu(r)',
        'rapunzel(r)',
        'ready rice(r)',
        'realime(r)',
        'red gold(r)',
        'renees(r)',
        'reynolds wrap(r)', 'reynolds(r)',
        'rice sides(tm)',
        'ronzoni(r)',
        'rotel(r)',
        'saco(r)',
        'sambazon',
        'samuel smith(r)',
        'sargento(r)',
        'sason accent(r)',
        'sb(r) golden curry', 'sb(r)',
        'seasonall(r)',
        'shiner bock(r)',
        'silk(r)',
        'simple crisp(tm)',
        'simply balanced(tm)',
        'simply potatoes(r)',
        'slap ya mama(r)',
        'smithfield(r)',
        'snack factory(r)',
        'snickers(r)',
        'so delicious(r)',
        'southern comfort (r)',
        #'spam(r)',
        'spectrum(r)',
        'spice islands(r)',
        'spike(r)',
        'splenda(r)',
        'sriracha(r)',
        'steam whistle pilsner(r)',
        'sunsweet(r) dnoir(tm)',
        'sutton dodge(tm)',
        'swanson(r) vegetable flavor boost(r)', 'swanson(r)', 'natural goodness(r)',
        'sweet baby rays(r)',
        'sweetn low(r)',
        'tabasco(r)', 'tabasco(tm)',
        'taco bell(r)',
        'tajin(r)',
        'tapatio(r)',
        'tater tots(r)',
        'texas pete(r)', 'texas petess(r)',
        'thai kitchen(r)',
        'three bridges(r)',
        'tony chacheres(r)',
        'top ramen(r)',
        'trader joes(r)',
        'truroots(r)',
        'truvia(r)',
        'uncle bens (r)', 'uncle bens(r)',
        'v(r)', # what remains from "V8"
        'valentina(r)',
        'vegenaise(r)',
        'veggie(tm)',
        #'velveeta(r)',
        'vidalia(r)',
        'vietti(r)',
        'vindaloo sauce maya kaimal(r)', 'vindaloo(r)',
        'voskos(r)',
        'wesson(r)',
        'white lily(r)',
        'wishbone(r)',
        'wondra(r)',
        'yoplait(r)',
        'yuengling(r)',
        'yves veggie cuisine(r)',
        'zatarains(r)',
        'zing zang(r)',
    ]
    
    def remove_brands(ingred_str):
        str_without_brand = ingred_str
        for bn in brand_names:
            str_without_brand = str_without_brand.replace(bn, '')
        return str_without_brand
    
    ingreds_without_brands = [' '.join(remove_brands(ingred_str).strip().split()) for ingred_str in ingreds_only]
    
    return ingreds_without_brands



In [8]:
ingreds_arr_of_lists = np.squeeze(df[['ingredients']].values)
vect_extract_ingred = np.vectorize(extract_ingredients, otypes=[list])

processed_ingreds_arr_of_lists = vect_extract_ingred(ingreds_arr_of_lists)

#print type(processed_ingreds_arr_of_lists)
print "Processed ingredients from {} recipes".format(len(processed_ingreds_arr_of_lists))
#print processed_ingreds_arr_of_lists.size


Processed ingredients from 89061 recipes


In [9]:
flattened_ingred_list = np.hstack(processed_ingreds_arr_of_lists)
print len(flattened_ingred_list)
#print flattened_ingred_list

unique_vocab = sorted(list(set(flattened_ingred_list)))
#print unique_vocab
print "{} unique ingredients found".format(len(unique_vocab))

851920
18996 unique ingredients found


In [10]:
flat_ingred_arr = [' '.join(lst) for lst in processed_ingreds_arr_of_lists]

cv = CountVectorizer(vocabulary=unique_vocab)
recipe_ingredient_matrix = cv.fit_transform(flat_ingred_arr)

print type(recipe_ingredient_matrix)
print recipe_ingredient_matrix.size
print recipe_ingredient_matrix.shape

<class 'scipy.sparse.csr.csr_matrix'>
959104


In [11]:
zip(cv.get_feature_names(),
    np.asarray(recipe_ingredient_matrix.sum(axis=0)).ravel())

[('', 0),
 ('(r) condensed cream celery soup', 0),
 ('(r) condensed cream mushroom soup', 0),
 ('abalone shell', 0),
 ('absinthe', 4),
 ('acai berry pulp', 0),
 ('acai berry sorbet', 0),
 ('acai powder', 0),
 ('acaipomegranate juice', 0),
 ('accents(r) sprouted quinoa trio', 0),
 ('accents(tm) sprouted rice trio', 0),
 ('accompaniments', 2),
 ('acesulfame potassium sweetener', 0),
 ('achar masala', 0),
 ('achiote annatto seeds', 0),
 ('achiote paste', 0),
 ('achiote powder', 0),
 ('achiote seed', 0),
 ('achiote seeds', 0),
 ('acini de pepe pasta', 0),
 ('acini di pepe pasta', 0),
 ('acorn squash', 0),
 ('act ii(r) fat free popcorn', 0),
 ('active yeast', 0),
 ('active yeast bread machine yeast', 0),
 ('active yeast fleischmanns activedry yeast(r)', 0),
 ('active yeast fleischmanns(r) rapidrise yeast', 0),
 ('addins', 2),
 ('additional butter margarine', 0),
 ('additional butter sugar garnish', 0),
 ('additional cheddar cheese', 0),
 ('additional cocktail sauce dipping', 0),
 ('addition

In [34]:
with open('ingreds_arr_of_lists.pkl', 'wb') as f:
    pickle.dump(ingreds_arr_of_lists, f)

with open('processed_ingreds_arr_of_lists.pkl', 'wb') as f:
    pickle.dump(processed_ingreds_arr_of_lists, f)

with open('flattened_ingred_list.pkl', 'wb') as f:
    pickle.dump(flattened_ingred_list, f)
    
with open('flat_ingred_arr.pkl', 'wb') as f:
    pickle.dump(flat_ingred_arr, f)

In [35]:
#ingred_cooccurrence_matrix = np.dot(recipe_ingredient_matrix.transpose(), recipe_ingredient_matrix)
#ingred_cooccurrence_matrix.setdiag(0)

ingred_cooccurrence_matrix = (recipe_ingredient_matrix.T * recipe_ingredient_matrix)

In [36]:
#ingred_cooccurrence_matrix.todense().size

In [39]:
print ingred_cooccurrence_matrix.nnz
print ingred_cooccurrence_matrix.size
print ingred_cooccurrence_matrix.shape

147621
147621
(18996, 18996)


In [40]:
print type(ingred_cooccurrence_matrix)

# https://stackoverflow.com/a/8980156/2491761
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

save_sparse_csr('ingred_cooccurrence_matrix', ingred_cooccurrence_matrix) #csc matrix also apparently works

<class 'scipy.sparse.csc.csc_matrix'>


In [41]:
!aws s3 cp ingred_cooccurrence_matrix.npz s3://RecipeVectors

upload: ./ingred_cooccurrence_matrix.npz to s3://RecipeVectors/ingred_cooccurrence_matrix.npz


In [42]:
svd2 = TruncatedSVD(n_components=50)
reduced_cooccurrence_matrix = svd2.fit_transform(ingred_cooccurrence_matrix)

print type(reduced_cooccurrence_matrix)
print reduced_cooccurrence_matrix.size
print reduced_cooccurrence_matrix.shape

<type 'numpy.ndarray'>
949800
(18996, 50)


In [43]:
with open('reduced_cooccurrence_matrix.pkl', 'wb') as f:
    pickle.dump(reduced_cooccurrence_matrix, f)

In [44]:
!aws s3 cp reduced_cooccurrence_matrix.pkl s3://RecipeVectors

upload: ./reduced_cooccurrence_matrix.pkl to s3://RecipeVectors/reduced_cooccurrence_matrix.pkl


In [None]:
def makePipeline():
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=1))
    ])
    return pipeline

def trainModel(X, Y):
    model = makePipeline()
    model.fit(X, Y)
    return model

X = [' '.join(f) for f in df['ingredients'].values]
Y = df['categories'].values   # Roy: I think this needs to change to a 

model = trainModel(X, Y)

In [None]:
true_k = len(np.unique(Y))
true_k

In [None]:
preds = model.predict(X)

In [None]:
uniq = np.unique(Y)
pred_labeled = [uniq[1] if p == 0 else uniq[0] for p in preds]
print confusion_matrix(pred_labeled, Y)

In [None]:
print classification_report(pred_labeled, Y)

In [None]:
print("Top terms per cluster:")

vectorizer = model.named_steps['vect']
km = model.named_steps['clf']
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print "Cluster %d:" % i
    for ind in order_centroids[i, :10]:
        print ' %s' % terms[ind]
    print


In [None]:
def getMultiClassData():
    filename2 = './recipeVectors/allRecipes_recipes.json'
#     filename2 = '../../data/cleandata/sunbasket_noapp.csv'
    df = pd.read_json(filename2)
    return df

def concatIngredients(arr):
    return ','.join(arr).encode('ascii', 'ignore')


def getTopCategory(arr):
    if arr == arr and len(arr) > 0:
        return arr[0].encode('ascii', 'ignore')

    return None


def processKerasModel(df_):
    embedding_length = 100
    top_words = 10000
    df = df_.copy()
    df['features'] = df['ingredients'].apply(concatIngredients)
    df['label'] = df['categories'].apply(getTopCategory)
    df = df[df['label'].astype(str) != 'nan']
    le = preprocessing.LabelEncoder()
    features = df['features'].values
    labels = df['label'].values
    
    le.fit(np.unique(labels))
    print list(le.classes_), 'num of labels:',len(np.unique(labels))
    labels = le.transform(labels) 
    print labels[:10]
#     X_train, Y_train, X_test, Y_test = ut.simpleSplit(features, labels)

    tokenizer = Tokenizer(nb_words=top_words)
    tokenizer.fit_on_texts(features)
    sequences = tokenizer.texts_to_sequences(features)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=embedding_length)

#     labels = np_utils.to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(0.2 * data.shape[0])

    X_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    X_test = data[-nb_validation_samples:]
    y_test = labels[-nb_validation_samples:]
    
    embedding_vecor_length = 100
    model = Sequential()
    model.add(Embedding(top_words, embedding_vecor_length, input_length=embedding_length))
#     model.add(Dropout(0.5))
    model.add(LSTM(100)) #, return_sequences=True))
    model.add(Dropout(0.4))
#     model.add(Dense(1, activation='sigmoid'))
    model.add(Dense(len(np.unique(labels)), activation='softmax'))
    
    model.compile(loss='sparse_categorical_crossentropy',
#         loss='sparse_categorical_crossentropy',
#               optimizer='rmsprop',
                optimizer='adam',
#               loss='sparse_categorical_crossentropy',
                metrics=['acc'])
#               metrics=['accuracy'])
#     model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
    print(model.summary())
    
    model.fit(X_train, y_train, epochs=10, batch_size=100)
    # Final evaluation of the model
    return model, X_test, y_test
finalset = getMultiClassData()

print finalset.sample(5)

dl_model, X_test, y_test = processKerasModel(finalset)
scores = dl_model.evaluate(X_test, y_test, verbose=1)  
print scores


