In [8]:
import numpy as np
import pandas as pd

import pickle
import sys

from scipy.sparse.csr import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils


Using TensorFlow backend.


In [6]:
#!aws s3 cp s3://RecipeVectors/sparse_recipe_ingredient_matrix.npz .

In [4]:
# https://stackoverflow.com/a/8980156/2491761
def load_sparse_csr(filename):
    """Rebuild a scipy CSR matrix from an .npz archive.

    The archive must contain the arrays 'data', 'indices', 'indptr' and
    'shape' (the pieces of a CSR matrix saved with np.savez).

    Args:
        filename: path (or file object) accepted by np.load.

    Returns:
        A csr_matrix built from the stored arrays.
    """
    # np.load on an .npz returns a lazy archive that keeps the file open;
    # the with-block closes it once the arrays have been read.
    with np.load(filename) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']))

# Load the sparse matrix from the .npz archive in the working directory
# (the commented-out `aws s3 cp` line above shows where the file is fetched from).
recipe_ingredient_matrix = load_sparse_csr("sparse_recipe_ingredient_matrix.npz")

In [5]:
# Reduce recipe_ingredient_matrix to 50 SVD components.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the reduced matrix repeatable across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=0)
reduced_recipe_ingredient_matrix = svd.fit_transform(recipe_ingredient_matrix)

# Sanity check: result type and total number of elements (rows * components).
print(type(reduced_recipe_ingredient_matrix))
print(reduced_recipe_ingredient_matrix.size)

<type 'numpy.ndarray'>
6669250


In [None]:
#!aws s3 cp s3://RecipeVectors/unique_ingredients.pkl .

In [9]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load unique_ingredients.pkl when it comes from a trusted source.
with open('unique_ingredients.pkl', 'rb') as f:
    unique_ingredients = pickle.load(f)

In [None]:
#!aws s3 cp s3://RecipeVectors/CleanedIngredients.pkl .

In [7]:
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only read CleanedIngredients.pkl when it comes from a trusted source.
df = pd.read_pickle('CleanedIngredients.pkl')

In [None]:
def makePipeline(n_clusters=2, random_state=None):
    """Build the CountVectorizer -> TF-IDF -> KMeans clustering pipeline.

    Args:
        n_clusters: number of KMeans clusters. Defaults to 2, the value that
            was previously hard-coded.
        random_state: seed forwarded to KMeans. With n_init=1 the result
            depends entirely on one random initialisation, so pass an int
            for repeatable clusters. None (default) leaves it unseeded.

    Returns:
        An unfitted sklearn Pipeline with steps named 'vect', 'tfidf', 'clf'.
    """
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100,
                       n_init=1, random_state=random_state)),
    ])

def trainModel(X, Y):
    """Create a fresh pipeline via makePipeline() and fit it on (X, Y).

    Returns the fitted pipeline.
    """
    # Pipeline.fit returns the pipeline itself, so build-and-fit can be chained.
    return makePipeline().fit(X, Y)

# Join each row's ingredient tokens into one space-separated string.
X = [' '.join(f) for f in df['ingredients'].values]
Y = df['categories'].values   # TODO(Roy): the original note was cut off mid-sentence ("I think this needs to change to a ..."); confirm the intended label source

# NOTE: the pipeline's final step is KMeans, which is unsupervised; Y is
# passed to fit() but does not influence the clustering.
model = trainModel(X, Y)

In [None]:
# Number of distinct values in Y; displayed as the cell output.
true_k = len(np.unique(Y))
true_k

In [None]:
# Run X through the fitted pipeline; the final KMeans step yields one
# integer cluster id per document.
preds = model.predict(X)

In [None]:
# Cluster ids carry no inherent meaning; this maps cluster 0 to the second
# sorted label and every other cluster id to the first one.
uniq = np.unique(Y)
pred_labeled = [uniq[1] if p == 0 else uniq[0] for p in preds]
# scikit-learn's signature is confusion_matrix(y_true, y_pred): ground truth
# goes first so rows are true labels and columns are predictions.
print(confusion_matrix(Y, pred_labeled))

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(Y, pred_labeled))

In [None]:
print("Top terms per cluster:")

vectorizer = model.named_steps['vect']
km = model.named_steps['clf']
# argsort is ascending; reversing the columns gives, for each centroid, the
# feature indices ordered from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
# Loop over the centroids the fitted model actually has. Using true_k here
# raised IndexError whenever Y had more distinct values than KMeans clusters
# (and silently skipped clusters when it had fewer).
for i in range(order_centroids.shape[0]):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print('')


In [None]:
def getMultiClassData(filename='./recipeVectors/allRecipes_recipes.json'):
    """Load a recipe dataset from a JSON file into a DataFrame.

    Args:
        filename: path to the JSON file. Defaults to the allRecipes dump
            that was previously hard-coded, so existing no-argument calls
            behave as before.

    Returns:
        The DataFrame produced by pd.read_json(filename).
    """
    return pd.read_json(filename)

def concatIngredients(arr):
    """Comma-join the strings in arr and return the result ASCII-encoded.

    Characters that cannot be represented in ASCII are dropped.
    """
    joined = ','.join(arr)
    ascii_only = joined.encode('ascii', 'ignore')
    return ascii_only


def getTopCategory(arr):
    """Return the first entry of arr ASCII-encoded, or None if there is none.

    Args:
        arr: a sequence of category strings, or a missing value
            (None or float NaN).

    Returns:
        arr[0] encoded as ASCII with non-ASCII characters dropped, or None
        when arr is None, a float (e.g. NaN) or empty.
    """
    # Missing cells show up as None or float NaN. The previous `arr == arr`
    # test only caught NaN: None fell through to len(None) and raised
    # TypeError, and array inputs made the comparison ambiguous.
    if arr is None or isinstance(arr, float):
        return None
    if len(arr) == 0:
        return None
    return arr[0].encode('ascii', 'ignore')


def processKerasModel(df_):
    """Train an Embedding -> LSTM -> softmax classifier on ingredient text.

    Each row's 'ingredients' are joined into one string (concatIngredients)
    and the first entry of 'categories' (getTopCategory) is used as its
    label. Rows without a label are dropped. The data is shuffled and the
    last 20% is held out.

    Args:
        df_: DataFrame with 'ingredients' and 'categories' columns. It is
            copied, not modified.

    Returns:
        (model, X_test, y_test): the fitted Keras model, the held-out padded
        token sequences and their integer-encoded labels.
    """
    max_sequence_length = 100      # tokens per row after padding/truncation
    embedding_vector_length = 100  # size of each learned word vector
    top_words = 10000              # vocabulary size kept by the tokenizer
    validation_fraction = 0.2

    df = df_.copy()
    df['features'] = df['ingredients'].apply(concatIngredients)
    df['label'] = df['categories'].apply(getTopCategory)
    # getTopCategory returns None for rows without a category. The previous
    # filter (astype(str) != 'nan') let those rows through because str(None)
    # is 'None', so missing labels became a class of their own.
    df = df[df['label'].notnull()]

    features = df['features'].values
    labels = df['label'].values

    # Map the label strings to integers 0..num_classes-1.
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    num_classes = len(le.classes_)
    print('%s num of labels: %d' % (list(le.classes_), num_classes))
    labels = le.transform(labels)
    print(labels[:10])

    # `num_words` is the Keras 2 name of this argument (`nb_words` is the
    # deprecated Keras 1 spelling); the rest of this function already uses
    # the Keras 2 API (fit(..., epochs=...)).
    tokenizer = Tokenizer(num_words=top_words)
    tokenizer.fit_on_texts(features)
    sequences = tokenizer.texts_to_sequences(features)
    print('Found %s unique tokens.' % len(tokenizer.word_index))

    data = pad_sequences(sequences, maxlen=max_sequence_length)
    # Format explicitly: print('text', value) on Python 2 prints a tuple repr.
    print('Shape of data tensor: %s' % (data.shape,))
    print('Shape of label tensor: %s' % (labels.shape,))

    # Shuffle, then hold out the last validation_fraction of the rows.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(validation_fraction * data.shape[0])
    # Slice with a positive index: with fewer than 5 rows the sample count is
    # 0, and the old [:-0] / [-0:] slices produced an empty training set and
    # a test set containing everything.
    split_at = data.shape[0] - nb_validation_samples
    X_train, X_test = data[:split_at], data[split_at:]
    y_train, y_test = labels[:split_at], labels[split_at:]

    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length,
                        input_length=max_sequence_length))
    model.add(LSTM(100))
    model.add(Dropout(0.4))
    # One softmax output per class; labels stay integer-encoded, hence the
    # sparse categorical loss.
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()

    model.fit(X_train, y_train, epochs=10, batch_size=100)
    return model, X_test, y_test
# Load the multi-class recipe data and show five random rows.
finalset = getMultiClassData()
print(finalset.sample(5))

# Train the Keras model, then evaluate it on the held-out split it returns.
dl_model, X_test, y_test = processKerasModel(finalset)
scores = dl_model.evaluate(X_test, y_test, verbose=1)
print(scores)


