In [45]:
import pickle
import sys
import string
import unidecode

import string
from collections import defaultdict, OrderedDict
import operator
import re
import pickle
import six # needed for Google Cloud client

import numpy as np
import pandas as pd
import scipy.sparse
import scipy.sparse
from scipy.sparse.csr import csr_matrix

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

In [2]:
# Load the pickled recipe DataFrame. Later cells read its `categories` column
# to build the class labels.
df = pd.read_pickle('CleanedIngredients.pkl')

In [3]:
# Load a previously pickled matrix from disk.
# CAUTION: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
# NOTE(review): `reduced_cooccurrence_matrix` is not referenced by any of the
# cells visible below -- confirm it is still needed.
with open('reduced_cooccurrence_matrix.pkl', 'rb') as f:
    reduced_cooccurrence_matrix = pickle.load(f)

In [11]:
# Number of rows in the DataFrame. The single-argument call form of print
# behaves the same on Python 2 and Python 3.
print(len(df))
#df.categories.str.join(' ').str.contains(u'Italian Recipes') #apply(' '.join()) #str.contains(u'Italian Recipes')

89061


In [26]:
def make_class_labels(df, categories_to_use=None):
    """Build a boolean multi-label indicator matrix from recipe categories.

    Args:
        df: DataFrame with a ``categories`` column whose entries are sequences
            of category-name strings.
        categories_to_use: Optional sequence of category names, one per output
            column. Defaults to the five categories listed below.
    Returns:
        Boolean ndarray of shape ``(len(df), len(categories_to_use))``. Entry
        ``[i, j]`` is True when category ``j`` occurs as a literal substring of
        row ``i``'s space-joined category names. Because this is a substring
        test, a name that appears inside a longer category name also matches.
        Rows whose ``categories`` value is missing are labelled False.
    """
    if categories_to_use is None:
        categories_to_use = [
            u'Asian Recipes', # 6736 occurrences
            u'Italian Recipes', # 4964 occurrences
            u'Breakfast and Brunch', # 4351 occurrences
            u'Mexican Recipes', # 4063 occurrences
            u'Drinks', # 3997 occurrences
        ]

    # The joined string does not depend on the category being tested, so build
    # it once instead of once per category.
    joined_categories = df.categories.str.join(' ')

    y = np.zeros((len(df), len(categories_to_use)), dtype=bool)
    for idx, cat_to_use in enumerate(categories_to_use):
        # regex=False: match the name literally rather than as a regex pattern.
        # na=False: a missing value would otherwise come back as NaN, which
        # becomes True when stored into the boolean matrix.
        y[:, idx] = joined_categories.str.contains(cat_to_use, regex=False, na=False)

    return y

In [27]:
# Boolean label matrix: one row per row of `df`, one column per category.
y = make_class_labels(df)

In [30]:
from IPython.display import display
# Raise the pandas/NumPy display limits so long sequences and arrays are shown
# in full instead of being summarized.
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# Newer NumPy rejects threshold=np.nan with a ValueError; sys.maxsize is the
# documented way to turn summarization off.
np.set_printoptions(threshold=sys.maxsize)

#print y

In [29]:
# Positive examples per category (column-wise count of True labels).
y.sum(axis=0)

array([6736, 4964, 4995, 4063, 3999])

In [18]:
# Sanity check: (number of rows in df, number of categories).
y.shape

(89061, 5)

In [38]:
# find/count rows that are in none of the categories:

# Row indices whose label vector is all False.
no_categories = np.where(~y.any(axis=1))[0]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("{} examples do not belong in any of the categories".format(len(no_categories)))

65681 examples do not belong in any of the categories


In [32]:
def load_sparse_csr(filename):
    """Load a CSR sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape``, which are handed to the ``csr_matrix`` constructor.

    Args:
        filename: Path (or file object) passed to ``np.load``.
    Returns:
        The reconstructed ``csr_matrix``.
    """
    # https://stackoverflow.com/a/8980156/2491761
    # np.load on an .npz archive returns a handle that keeps the file open;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=loader['shape'])

In [34]:
# Feature matrix used for training below.
# NOTE(review): presumably one row per recipe, in the same row order as `df`
# (and therefore `y`) -- confirm against the notebook that wrote this file.
sparse_recipe_ingredient_matrix = load_sparse_csr('sparse_recipe_ingredient_matrix.npz')

In [39]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sparse_recipe_ingredient_matrix,
    y,
    test_size=0.25,
    random_state=42
)

In [40]:
# Category names, in the same order as the columns of `y`: the loops below use
# the position in this list as the column index into y_train / y_test, so this
# list must stay identical to the one inside make_class_labels.
# NOTE(review): the column sums of `y` printed earlier in this notebook were
# 6736, 4964, 4995, 4063 and 3999, so the counts noted below for
# 'Breakfast and Brunch' and 'Drinks' do not match the labels actually used.
categories_to_use = [
    u'Asian Recipes', # 6736 occurrences
    u'Italian Recipes', # 4964 occurrences
    u'Breakfast and Brunch', # 4351 occurrences
    u'Mexican Recipes', # 4063 occurrences
    u'Drinks', # 3997 occurrences
]


In [42]:
def print_metrics(true_y, predicted_y, target_names, y_score=None):
    """Print classification metrics for one set of predictions.

    Prints the classification report, the raw and row-normalized confusion
    matrices, the overall accuracy and, when scores are supplied, the area
    under the ROC curve.

    Args:
        true_y: The ground truth target labels
        predicted_y: The predicted labels from the classifier
        target_names: Display names for the classes, passed through to
            classification_report
        y_score: If not None, the vector of scores for the positive class,
            passed to roc_auc_score (Optional)
    Returns:
        None
    """
    # Every print below uses the single-argument call form, which produces the
    # same output on Python 2 and Python 3. print("") emits the blank
    # separator line.
    print(classification_report(true_y, predicted_y, target_names=target_names))
    cm = confusion_matrix(true_y, predicted_y)
    print("Confusion matrix:")
    print(cm)
    # Divide each row by its own total so every row sums to 1.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("")
    print("Confusion matrix(normalized):")
    print(cm_normalized)
    print("")

    print('Overall accuracy: {}'.format(accuracy_score(true_y, predicted_y)))
    print("")
    if y_score is not None:
        print("Area Under the ROC Curve: {}".format(roc_auc_score(true_y, y_score)))
        print("")

In [43]:
def single_category_pipeline(X_train, X_test, y_train_single_col, y_test_single_col, target_names,
                             random_state=42):
    """Grid-search a TruncatedSVD + k-nearest-neighbours pipeline for one category.

    Searches over the number of SVD components and the number of neighbours
    with 3-fold cross-validation scored by ROC AUC, prints the per-candidate
    scores, then reports metrics on the held-out test set.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train_single_col: Binary training labels for a single category.
        y_test_single_col: Binary test labels for the same category.
        target_names: (negative name, positive name) used in the printed report.
        random_state: Seed for TruncatedSVD so repeated runs give the same
            result. Pass None for an unseeded run.
    Returns:
        The fitted GridSearchCV object.
    """
    svd = TruncatedSVD(random_state=random_state)
    svd_components = [5, 10, 20, 50, 100]

    clf_knn = KNeighborsClassifier()
    n_neighbors = [1, 3, 5, 7]

    pipe = Pipeline(steps=[
        ('decomp', svd),
        ('knn', clf_knn)
    ])

    # find the best combination of SVD components and number of neighbors:
    knn_cv_estimator = GridSearchCV(pipe,
                                dict(
                                    decomp__n_components = svd_components,
                                    knn__n_neighbors = n_neighbors
                                ),
                                cv=3, verbose=2, n_jobs=2, scoring='roc_auc')

    knn_cv_estimator.fit(X_train, y_train_single_col)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('SVD and KNeighborsClassifier - Best parameters set found on training set:')
    print(knn_cv_estimator.best_params_)

    print("Grid scores:")
    # https://stackoverflow.com/a/42800056/2491761
    means = knn_cv_estimator.cv_results_['mean_test_score']
    stds = knn_cv_estimator.cv_results_['std_test_score']

    for mean, std, params in zip(means, stds, knn_cv_estimator.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    pred_y = knn_cv_estimator.predict(X_test)
    proba_y = knn_cv_estimator.predict_proba(X_test)
    # Column 1 of predict_proba holds the positive-class probability.
    print_metrics(y_test_single_col, pred_y, target_names, proba_y[:,1])

    return knn_cv_estimator

In [46]:
# Fit one binary SVD + k-NN model per category; column idx of y_train / y_test
# holds the labels for categories_to_use[idx].
# The recorded run of this cell took roughly 2.5-3 hours per category.
models_for_each_category = []
for idx, cat_name in enumerate(categories_to_use):
    target_names = ('Not ' + cat_name, cat_name)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("About to run test for: {}".format(target_names))
    model = single_category_pipeline(X_train, X_test, y_train[:,idx], y_test[:,idx], target_names)
    models_for_each_category.append(model)

About to run test for: (u'Not Asian Recipes', u'Asian Recipes')
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   1.9s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 172.1min finished


SVD and KNeighborsClassifier - Best parameters set found on training set:
{'knn__n_neighbors': 7, 'decomp__n_components': 100}
                   precision    recall  f1-score   support

Not Asian Recipes       0.96      0.99      0.97     20557
    Asian Recipes       0.77      0.51      0.61      1709

      avg / total       0.95      0.95      0.95     22266

Confusion matrix:
[[20288   269]
 [  831   878]]

Confusion matrix(normalized):
[[ 0.98691443  0.01308557]
 [ 0.48624927  0.51375073]]

Overall accuracy: 0.950597323273

Area Under the ROC Curve: 0.944155659841

About to run test for: (u'Not Italian Recipes', u'Italian Recipes')
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 155.1min finished


SVD and KNeighborsClassifier - Best parameters set found on training set:
{'knn__n_neighbors': 7, 'decomp__n_components': 100}
                     precision    recall  f1-score   support

Not Italian Recipes       0.96      0.99      0.97     20984
    Italian Recipes       0.60      0.34      0.43      1282

        avg / total       0.94      0.95      0.94     22266

Confusion matrix:
[[20691   293]
 [  846   436]]

Confusion matrix(normalized):
[[ 0.98603698  0.01396302]
 [ 0.6599064   0.3400936 ]]

Overall accuracy: 0.948845773826

Area Under the ROC Curve: 0.911885431765

About to run test for: (u'Not Breakfast and Brunch', u'Breakfast and Brunch')
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 142.4min finished


SVD and KNeighborsClassifier - Best parameters set found on training set:
{'knn__n_neighbors': 7, 'decomp__n_components': 100}
                          precision    recall  f1-score   support

Not Breakfast and Brunch       0.96      0.99      0.97     20999
    Breakfast and Brunch       0.62      0.34      0.44      1267

             avg / total       0.94      0.95      0.94     22266

Confusion matrix:
[[20735   264]
 [  834   433]]

Confusion matrix(normalized):
[[ 0.98742797  0.01257203]
 [ 0.65824783  0.34175217]]

Overall accuracy: 0.950687146322

Area Under the ROC Curve: 0.899527631883

About to run test for: (u'Not Mexican Recipes', u'Mexican Recipes')
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 147.9min finished


SVD and KNeighborsClassifier - Best parameters set found on training set:
{'knn__n_neighbors': 7, 'decomp__n_components': 100}
                     precision    recall  f1-score   support

Not Mexican Recipes       0.97      0.99      0.98     21273
    Mexican Recipes       0.62      0.37      0.47       993

        avg / total       0.96      0.96      0.96     22266

Confusion matrix:
[[21046   227]
 [  621   372]]

Confusion matrix(normalized):
[[ 0.9893292   0.0106708 ]
 [ 0.62537764  0.37462236]]

Overall accuracy: 0.961915027396

Area Under the ROC Curve: 0.922134251565

About to run test for: (u'Not Drinks', u'Drinks')
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.4s
[CV] knn__n_neighbors=1, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=1, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=3, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=3, decomp__n_components=5, total=   0.5s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] ....... knn__n_neighbors=5, decomp__n_components=5, total=   0.6s
[CV] knn__n_neighbors=5, decomp__n_components=5 ......................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 150.8min finished


SVD and KNeighborsClassifier - Best parameters set found on training set:
{'knn__n_neighbors': 7, 'decomp__n_components': 100}
             precision    recall  f1-score   support

 Not Drinks       0.99      0.99      0.99     21297
     Drinks       0.77      0.78      0.77       969

avg / total       0.98      0.98      0.98     22266

Confusion matrix:
[[21065   232]
 [  211   758]]

Confusion matrix(normalized):
[[ 0.98910645  0.01089355]
 [ 0.21775026  0.78224974]]

Overall accuracy: 0.980104194736

Area Under the ROC Curve: 0.985849012489



In [48]:
def single_category_pipeline_rf(X_train, X_test, y_train_single_col, y_test_single_col, target_names,
                                random_state=42):
    """Grid-search a TruncatedSVD + random forest pipeline for one category.

    Searches over the number of SVD components and the number of trees with
    3-fold cross-validation scored by ROC AUC, prints the per-candidate
    scores, then reports metrics on the held-out test set.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train_single_col: Binary training labels for a single category.
        y_test_single_col: Binary test labels for the same category.
        target_names: (negative name, positive name) used in the printed report.
        random_state: Seed shared by TruncatedSVD and RandomForestClassifier so
            repeated runs give the same result. Pass None for an unseeded run.
    Returns:
        The fitted GridSearchCV object.
    """
    svd = TruncatedSVD(random_state=random_state)
    svd_components = [50, 100, 200]

    clf_rf = RandomForestClassifier(random_state=random_state)
    rf_n_estimators = [50, 100, 200]

    pipe = Pipeline(steps=[
        ('decomp', svd),
        ('rf', clf_rf)
    ])

    # find the best combination of SVD components and number of trees:
    rf_cv_estimator = GridSearchCV(pipe,
                                dict(
                                    decomp__n_components = svd_components,
                                    rf__n_estimators = rf_n_estimators
                                ),
                                cv=3, verbose=2, n_jobs=2, scoring='roc_auc')

    rf_cv_estimator.fit(X_train, y_train_single_col)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('SVD and RandomForestClassifier - Best parameters set found on training set:')
    print(rf_cv_estimator.best_params_)

    print("Grid scores:")
    # https://stackoverflow.com/a/42800056/2491761
    means = rf_cv_estimator.cv_results_['mean_test_score']
    stds = rf_cv_estimator.cv_results_['std_test_score']

    for mean, std, params in zip(means, stds, rf_cv_estimator.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    pred_y = rf_cv_estimator.predict(X_test)
    proba_y = rf_cv_estimator.predict_proba(X_test)
    # Column 1 of predict_proba holds the positive-class probability.
    print_metrics(y_test_single_col, pred_y, target_names, proba_y[:,1])

    return rf_cv_estimator

In [49]:
# Fit one binary SVD + random forest model per category; column idx of
# y_train / y_test holds the labels for categories_to_use[idx].
rf_models_for_each_category = []
for idx, cat_name in enumerate(categories_to_use):
    target_names = ('Not ' + cat_name, cat_name)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("About to run test for: {}".format(target_names))
    model = single_category_pipeline_rf(X_train, X_test, y_train[:,idx], y_test[:,idx], target_names)
    rf_models_for_each_category.append(model)

About to run test for: (u'Not Asian Recipes', u'Asian Recipes')
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] rf__n_estimators=50, decomp__n_components=50 ....................
[CV] rf__n_estimators=50, decomp__n_components=50 ....................
[CV] ..... rf__n_estimators=50, decomp__n_components=50, total=  24.7s
[CV] rf__n_estimators=50, decomp__n_components=50 ....................
[CV] ..... rf__n_estimators=50, decomp__n_components=50, total=  25.5s
[CV] rf__n_estimators=100, decomp__n_components=50 ...................
[CV] ..... rf__n_estimators=50, decomp__n_components=50, total=  24.5s
[CV] rf__n_estimators=100, decomp__n_components=50 ...................
[CV] .... rf__n_estimators=100, decomp__n_components=50, total=  48.6s
[CV] rf__n_estimators=100, decomp__n_components=50 ...................
[CV] .... rf__n_estimators=100, decomp__n_components=50, total=  47.6s
[CV] rf__n_estimators=200, decomp__n_components=50 ...................
[CV] .... rf__n_estimato

[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 27.1min finished


SVD and RandomForestClassifier - Best parameters set found on training set:
{'rf__n_estimators': 200, 'decomp__n_components': 200}
Grid scores:
0.965 (+/-0.006) for {'rf__n_estimators': 50, 'decomp__n_components': 50}
0.966 (+/-0.005) for {'rf__n_estimators': 100, 'decomp__n_components': 50}
0.966 (+/-0.003) for {'rf__n_estimators': 200, 'decomp__n_components': 50}
0.967 (+/-0.006) for {'rf__n_estimators': 50, 'decomp__n_components': 100}
0.970 (+/-0.004) for {'rf__n_estimators': 100, 'decomp__n_components': 100}
0.972 (+/-0.004) for {'rf__n_estimators': 200, 'decomp__n_components': 100}
0.972 (+/-0.008) for {'rf__n_estimators': 50, 'decomp__n_components': 200}
0.972 (+/-0.009) for {'rf__n_estimators': 100, 'decomp__n_components': 200}
0.974 (+/-0.007) for {'rf__n_estimators': 200, 'decomp__n_components': 200}
                   precision    recall  f1-score   support

Not Asian Recipes       0.99      1.00      0.99     20557
    Asian Recipes       0.97      0.85      0.91      1709


[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 28.4min finished


SVD and RandomForestClassifier - Best parameters set found on training set:
{'rf__n_estimators': 200, 'decomp__n_components': 200}
Grid scores:
0.938 (+/-0.020) for {'rf__n_estimators': 50, 'decomp__n_components': 50}
0.944 (+/-0.019) for {'rf__n_estimators': 100, 'decomp__n_components': 50}
0.946 (+/-0.017) for {'rf__n_estimators': 200, 'decomp__n_components': 50}
0.942 (+/-0.020) for {'rf__n_estimators': 50, 'decomp__n_components': 100}
0.951 (+/-0.016) for {'rf__n_estimators': 100, 'decomp__n_components': 100}
0.951 (+/-0.017) for {'rf__n_estimators': 200, 'decomp__n_components': 100}
0.946 (+/-0.015) for {'rf__n_estimators': 50, 'decomp__n_components': 200}
0.952 (+/-0.016) for {'rf__n_estimators': 100, 'decomp__n_components': 200}
0.954 (+/-0.013) for {'rf__n_estimators': 200, 'decomp__n_components': 200}
                     precision    recall  f1-score   support

Not Italian Recipes       0.99      1.00      0.99     20984
    Italian Recipes       0.99      0.78      0.87     

[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 23.9min finished


SVD and RandomForestClassifier - Best parameters set found on training set:
{'rf__n_estimators': 200, 'decomp__n_components': 200}
Grid scores:
0.938 (+/-0.006) for {'rf__n_estimators': 50, 'decomp__n_components': 50}
0.941 (+/-0.009) for {'rf__n_estimators': 100, 'decomp__n_components': 50}
0.946 (+/-0.007) for {'rf__n_estimators': 200, 'decomp__n_components': 50}
0.942 (+/-0.004) for {'rf__n_estimators': 50, 'decomp__n_components': 100}
0.947 (+/-0.007) for {'rf__n_estimators': 100, 'decomp__n_components': 100}
0.949 (+/-0.004) for {'rf__n_estimators': 200, 'decomp__n_components': 100}
0.940 (+/-0.004) for {'rf__n_estimators': 50, 'decomp__n_components': 200}
0.948 (+/-0.005) for {'rf__n_estimators': 100, 'decomp__n_components': 200}
0.951 (+/-0.004) for {'rf__n_estimators': 200, 'decomp__n_components': 200}
                          precision    recall  f1-score   support

Not Breakfast and Brunch       0.98      1.00      0.99     20999
    Breakfast and Brunch       0.98      0.65

[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 25.6min finished


SVD and RandomForestClassifier - Best parameters set found on training set:
{'rf__n_estimators': 200, 'decomp__n_components': 200}
Grid scores:
0.943 (+/-0.004) for {'rf__n_estimators': 50, 'decomp__n_components': 50}
0.949 (+/-0.003) for {'rf__n_estimators': 100, 'decomp__n_components': 50}
0.952 (+/-0.007) for {'rf__n_estimators': 200, 'decomp__n_components': 50}
0.949 (+/-0.006) for {'rf__n_estimators': 50, 'decomp__n_components': 100}
0.954 (+/-0.009) for {'rf__n_estimators': 100, 'decomp__n_components': 100}
0.956 (+/-0.005) for {'rf__n_estimators': 200, 'decomp__n_components': 100}
0.951 (+/-0.009) for {'rf__n_estimators': 50, 'decomp__n_components': 200}
0.955 (+/-0.005) for {'rf__n_estimators': 100, 'decomp__n_components': 200}
0.961 (+/-0.006) for {'rf__n_estimators': 200, 'decomp__n_components': 200}
                     precision    recall  f1-score   support

Not Mexican Recipes       0.99      1.00      0.99     21273
    Mexican Recipes       0.98      0.77      0.86     

[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 13.9min finished


SVD and RandomForestClassifier - Best parameters set found on training set:
{'rf__n_estimators': 200, 'decomp__n_components': 100}
Grid scores:
0.986 (+/-0.002) for {'rf__n_estimators': 50, 'decomp__n_components': 50}
0.989 (+/-0.002) for {'rf__n_estimators': 100, 'decomp__n_components': 50}
0.990 (+/-0.001) for {'rf__n_estimators': 200, 'decomp__n_components': 50}
0.990 (+/-0.003) for {'rf__n_estimators': 50, 'decomp__n_components': 100}
0.992 (+/-0.002) for {'rf__n_estimators': 100, 'decomp__n_components': 100}
0.992 (+/-0.002) for {'rf__n_estimators': 200, 'decomp__n_components': 100}
0.990 (+/-0.002) for {'rf__n_estimators': 50, 'decomp__n_components': 200}
0.992 (+/-0.002) for {'rf__n_estimators': 100, 'decomp__n_components': 200}
0.992 (+/-0.003) for {'rf__n_estimators': 200, 'decomp__n_components': 200}
             precision    recall  f1-score   support

 Not Drinks       0.99      1.00      1.00     21297
     Drinks       0.95      0.85      0.90       969

avg / total      