In [1]:
import pandas as pd
import numpy as np
import json
import pickle

##text pre-processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist

##feature extraction + similarity
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF




## Concatenate json files and create dataframe

In [2]:
def concatJson(file_list, out_path="allRecipes.json"):
    """Concatenate several JSON files into a single JSON file.

    Each input file is parsed with ``json.load`` and its content is appended
    to one running list with ``+=``; the combined list is then written to
    ``out_path`` as JSON.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the JSON files to merge, in the order they should appear.
    out_path : str, optional
        Destination file. Defaults to ``"allRecipes.json"``.

    Returns
    -------
    None
        The merged data is only written to disk.

    Raises
    ------
    IOError / OSError
        If an input file cannot be opened.
    ValueError
        If an input file does not contain valid JSON.
    """
    full = []
    for path in file_list:
        with open(path, 'rb') as infile:
            full += json.load(infile)

    # Open the destination only after every input was read successfully, so a
    # missing or malformed input never truncates an existing merged file.
    with open(out_path, "w") as outfile:
        json.dump(full, outfile)

In [3]:
def openJson(name):
    """Parse the JSON file at ``name`` and return the decoded object."""
    with open(name, 'rb') as handle:
        return json.load(handle)

In [4]:
# Scraped recipe dumps, one JSON file per source site.
file_list = [
    "fullhelping_items.json",
    "iLoveVegan_items.json",
    "MinimalistBaker_items.json",
    "ohsheglows_items.json",
    "PaleOMG_items.json",
    "roastedRoot_items.json",
    "ohmyveggies_items.json",
]

In [5]:
# Merge every file in file_list into allRecipes.json, then load the merged content back.
concatJson(file_list)
full = openJson('allRecipes.json')

In [6]:
fullDF = pd.DataFrame(full)
# Keep only recipes that actually have ingredients. `> 0` (rather than `!= 0`)
# also drops rows whose ingredientsList is missing: .str.len() yields NaN for
# them, and NaN != 0 evaluates to True, which would let them through.
fullDF1 = fullDF[fullDF.ingredientsList.str.len() > 0]
# Renumber rows 0..n-1 so positional and label-based lookups agree downstream.
recipeDF = fullDF1.reset_index(drop=True)

In [7]:
# Checkpoint the filtered recipe frame to disk.
recipeDF.to_pickle('recipeDF_v3')

In [8]:
# Reload the recipe frame from the 'recipeDF_v3' checkpoint.
recipeDF = pd.read_pickle('recipeDF_v3')

## Write ingredients to a file in the input format for the NYT ingredient tagger

In [9]:
def nytIngredientInput(df):
    """Write every ingredient line to ``input.txt`` and return the lines.

    For each row of ``df`` the entries of ``df.ingredientsList`` are stripped
    of non-ASCII characters and written one per line as
    ``"<text> /<row> "`` (the row position is appended so it can be recovered
    later).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ingredientsList`` column whose cells are sequences of
        text strings.

    Returns
    -------
    list of [int, int, str]
        ``[row position, position within the recipe, ASCII-only text]`` for
        every ingredient line, in the order written.

    Notes
    -----
    ``input.txt`` is rewritten on every call, so re-running the cell does not
    append duplicate lines and the file always matches the returned list.
    """
    ingredientList = []
    # The context manager closes the file even if a row fails part-way.
    with open("input.txt", "w") as f:
        for row in range(len(df)):
            ingredients = df.ingredientsList.iloc[row]
            for i, raw in enumerate(ingredients):
                # Drop non-ASCII characters, then decode back to text so the
                # concatenation below is str + str (bytes + str fails on Python 3).
                line = raw.encode('ascii', 'ignore').decode('ascii')
                ingredientList.append([row, i, line])
                f.write(line + ' /' + str(row) + ' ' + '\n')
    return ingredientList

In [10]:
# Write input.txt and keep the [recipe row, ingredient position, text] triples.
ingredientList = nytIngredientInput(recipeDF)

In [11]:
# One row per ingredient line; 'recipeText' holds the ASCII-only ingredient text.
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [12]:
recipeDF.to_pickle('ingredientDF_v1')

In [13]:
recipeDF = pd.read_pickle('ingredientDF_v1')

## Merge NYT ingredient tags with ingredientDF
<p>(see NYT_CRF_IngredientPhraseTagger.txt for tags) </p>

In [13]:
# Load results_6-21.json (presumably the NYT tagger output — confirm), wrap it
# in a DataFrame and checkpoint it.
ingredientTags = openJson('results_6-21.json')
ingredientTagsDF = pd.DataFrame(ingredientTags)
ingredientTagsDF.to_pickle('ingredientTagsDF')

In [15]:
# Column-wise concat aligns rows on the index, so this assumes the tag rows are
# in the same order as ingredientDF — TODO confirm.
ingredientTaggedDF = pd.concat([ingredientDF,ingredientTagsDF], axis = 1)

In [24]:
def mergedIndex(df):
    """Split a trailing integer off ``df['name']`` into ``mergedrecipeIndex``.

    Every value of the ``name`` column is converted to ``str`` and split once
    on its last space. The part before the space replaces ``name``; the part
    after it is stored in ``mergedrecipeIndex`` when it parses as an integer,
    otherwise ``mergedrecipeIndex`` is the string ``'NA'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the result can be rebound by the caller.

    Raises
    ------
    KeyError
        If ``df`` has no ``name`` column.
    """
    names = []
    indices = []
    for value in df['name']:
        parts = str(value).rsplit(' ', 1)
        # NOTE(review): as in the previous implementation, the trailing token is
        # removed from the name even when it is not an integer — confirm intent.
        names.append(parts[0])
        try:
            indices.append(int(parts[1]))
        except (IndexError, ValueError):
            # No trailing token, or it is not an integer.
            indices.append('NA')

    # Assign whole columns instead of chained `df.name.loc[i] = ...` writes,
    # which can silently update a temporary copy. dtype=object keeps ints and
    # the 'NA' marker side by side.
    df['name'] = pd.Series(names, index=df.index, dtype=object)
    df['mergedrecipeIndex'] = pd.Series(indices, index=df.index, dtype=object)
    return df

In [None]:
# mergedIndex updates the frame in place, so the name is not rebound to the
# call's result.
mergedIndex(ingredientTaggedDF)

In [17]:
# Checkpoint the tagged ingredient frame.
ingredientTaggedDF.to_pickle('ingredientTaggedDF_v4')

In [4]:
# NOTE(review): this loads '_v3' although the save cell in this notebook writes
# '_v4' — confirm which checkpoint is intended.
ingredientTaggedDF = pd.read_pickle('ingredientTaggedDF_v3')

## Create Ingredient Documents

In [158]:
# Cast names to str so they can be joined; missing values typically become the
# literal 'nan', which is one of the stop words passed to the vectorizer.
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [159]:
def ingredientTokens(df,taggeddf):
    """Attach one space-joined ingredient-name document to each recipe.

    The ``name`` values of ``taggeddf`` are grouped by ``recipeIndex`` and
    joined with single spaces (row order within a recipe is preserved). Each
    document is stored in ``df['ingredientTokens']`` on the row whose index
    label equals that ``recipeIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Recipe frame; modified in place.
    taggeddf : pandas.DataFrame
        Per-ingredient frame with ``recipeIndex`` and ``name`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with a string column ``ingredientTokens``. Recipes with
        no rows in ``taggeddf`` get an empty string.
    """
    # One pass over the tagged rows instead of one boolean mask per recipe.
    documents = taggeddf.groupby('recipeIndex')['name'].agg(
        lambda names: ' '.join(names.astype(str))
    )
    # Align on df's index labels and assign the whole column at once; chained
    # `df[col][i] = ...` writes trigger SettingWithCopyWarning and may be lost.
    df['ingredientTokens'] = documents.reindex(df.index, fill_value='').astype(str)

    return df

In [30]:
def nlpTokens(document):
    """Normalize a text document into a comma-separated token string.

    Steps: split on runs of word characters, drop English stop words, apply
    the Porter stemmer, lemmatize each stem as a verb, drop any result that is
    itself a stop word, and join the remaining tokens with commas.

    Parameters
    ----------
    document : str
        Raw text to tokenize.

    Returns
    -------
    str
        The processed tokens joined by ``','`` (empty string if none remain).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # Build the helpers once per call rather than once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))  # set gives O(1) membership tests

    # Filter stop words before stemming: stemmed forms (e.g. "this" -> "thi")
    # no longer match the stop list and would otherwise slip through.
    tokens = [x for x in tokenizer.tokenize(document) if x.lower() not in stop]
    tokens = [lemmatizer.lemmatize(stemmer.stem(x), 'v') for x in tokens]

    # Keep the post-stemming filter so anything reduced to a stop word is
    # still removed, as before.
    tokens = [x for x in tokens if x not in stop]
    return ','.join(tokens)

In [167]:
# Build one ingredient document per recipe, then replace each document with its
# comma-separated, stemmed and stop-word-filtered tokens.
recipeDF = ingredientTokens(recipeDF,ingredientTaggedDF)
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(nlpTokens)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [170]:
# Checkpoint the recipe frame including the ingredientTokens column.
recipeDF.to_pickle('recipeDF_tokens_v3')

In [3]:
# Reload the tokenized recipe frame from the 'recipeDF_tokens_v3' checkpoint.
recipeDF = pd.read_pickle('recipeDF_tokens_v3')

## TFIDF vectorizer

In [4]:
# NOTE(review): the tfidfVectorizer call in this notebook passes its own literal
# list, so this tuple appears to be unused — confirm before relying on it.
stopList = ('tbsp','From')

In [5]:
def tfidfVectorizer(stopList,df):
    """Fit a TF-IDF model on ``df.ingredientTokens``.

    Parameters
    ----------
    stopList : collection of str
        Passed to ``TfidfVectorizer`` as ``stop_words``.
    df : pandas.DataFrame
        Frame whose ``ingredientTokens`` column holds one document per row.

    Returns
    -------
    tuple
        ``(tfidf_vectorizer, tfidf_matrix, terms, vocab)`` where ``terms`` is
        the list of ``(column index, term)`` pairs sorted by column index and
        ``vocab`` is the list of terms in that same order.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=.2,
        max_features=1300,
        min_df=0.01,
        stop_words=stopList,
        use_idf=True,
        ngram_range=(1,3),
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df.ingredientTokens)

    # vocabulary_ maps term -> column index; flip each pair so sorting orders by column.
    terms = sorted((column, term) for term, column in tfidf_vectorizer.vocabulary_.items())
    vocab = [term for _, term in terms]

    return tfidf_vectorizer,tfidf_matrix,terms,vocab

In [6]:
# Fit TF-IDF on the recipe token documents with a small custom stop list.
# NOTE(review): the ',' entry probably never matches a token — confirm.
tfidf_vectorizer,tfidf_matrix,terms,vocab = tfidfVectorizer(['nan','tbsp','tsp',','],recipeDF)

## Non-Negative Matrix Factorization

In [7]:
# Truncated SVD of the TF-IDF matrix (25 components). The seed is fixed so the
# decomposition is reproducible across kernel restarts.
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=25,
                                      n_iter=5,
                                      random_state=0)

In [8]:
# Display the singular values returned by randomized_svd.
Sigma

array([ 11.70401137,   9.22868086,   6.49565664,   6.27835743,
         6.09854453,   5.69469634,   5.6498461 ,   5.48071641,
         5.3851342 ,   5.31626579,   5.19163403,   5.03017515,
         4.98309644,   4.91513431,   4.85479782,   4.69007801,
         4.64749497,   4.62764541,   4.51477365,   4.42153623,
         4.40361155,   4.27058744,   4.25877967,   4.17779088,   4.1084884 ])

In [9]:
def NMFactorize(tfidf_matrix,n):
    """Fit an ``n``-component NMF model on ``tfidf_matrix``.

    The model uses random initialization with a fixed seed
    (``random_state=0``).

    Returns
    -------
    tuple
        ``(fitted NMF model, matrix returned by fit_transform)``.
    """
    model = NMF(n_components=n, init='random', random_state=0)
    transformed = model.fit_transform(tfidf_matrix)
    return model, transformed

In [10]:
NMFmodel,NMF_matrix = NMFactorize(tfidf_matrix,15) ## chose 15 components since the resulting component vectors largely fall under general food types

In [13]:
# Persist the fitted model and the transformed matrix. The context managers
# make sure each file is flushed and closed once the dump completes.
with open("NMFmodel_v6.p", "wb") as model_file:
    pickle.dump(NMFmodel, model_file)
with open("NMFmatrix_v6.p", "wb") as matrix_file:
    pickle.dump(NMF_matrix, matrix_file)